Introduction

Column 1

Bloomington

Column 2

Chicago

Column 3

Marin County

Data Issues

Identify

Plan

Resolve

Sample SQL to check for duplicate names

select names, count(*) as num_rows
from bloomington
group by names
having num_rows > 1

Sample SQL to remove duplicate names

select names, departments, max(salaries) as salaries
from bloomington
group by names, departments

Sample SQL to create column labeling part time or full time

select names 
      ,departments
      ,salaries
      ,case when salaries < 25000 then 'Part Time'
            else 'Full Time' end as emp_type
from bloomington

Verify

Analyze

Estimate

#filter the chicago data onto only full time employees
temp <- chicago %>% filter(emp_type == 'Full Time')

#fit a simple linear model with no intercept
glm_fit <- lm(salaries ~ 0 + departments, data=temp)

#get unique values of the departments
depts <- tibble(levels(chicago$departments))
colnames(depts) = c('departments')

#predict the mean and 95% prediction interval for each dept
preds <- data.frame(predict(glm_fit, newdata=depts, interval = "prediction"))

#combine back with unique list of depts
depts <- cbind(depts, preds)

#join back into main dataset
chic_final <- left_join(chicago, depts, by = c('departments')) %>% 
  filter(emp_type == 'Full Time')

SQL for the Join

select chi.*
      ,pred.lwr
      ,pred.upper
from chicago as chi
left join predictions as pred
on chi.departments = pred.departments

Plot the Results

Conclusion